# -*- coding: utf-8 -*-
import json

from scrapy.spiders import Spider

from PhenixScrapy.items import DmozItem


class DmozSpider(Spider):
    """Spider that turns the list entries of two DMOZ Python pages into items.

    Run it from a terminal with ``scrapy crawl dmoz``; append
    ``-o items.json`` to store the yielded items as a JSON file.
    """

    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    def parse(self, response):
        """Yield one ``DmozItem`` per ``<li>`` found directly under a ``<ul>``.

        Each item carries three fields, every one holding whatever
        ``.extract()`` returns for the corresponding relative XPath:

        * ``title`` -- text of the entry's direct ``<a>`` children
        * ``link`` -- ``href`` attribute of those ``<a>`` children
        * ``description`` -- text nodes that are direct children of the ``<li>``

        An earlier variant of this callback saved the raw response body to a
        local file named after the second-to-last segment of the URL instead
        of yielding items; that approach is no longer used.

        :param response: the downloaded page; only its ``xpath`` method is used
        :return: generator of populated ``DmozItem`` objects
        """
        # Field name -> XPath evaluated relative to each list entry. The order
        # matches the order in which the fields are assigned on the item.
        field_queries = (
            ('title', 'a/text()'),
            ('link', 'a/@href'),
            ('description', 'text()'),
        )
        for entry in response.xpath('//ul/li'):
            record = DmozItem()
            for field, query in field_queries:
                record[field] = entry.xpath(query).extract()
            yield record




